Note: bin metabat.75_sub.contigs got caught in the data and needs ot be removed. Looks contaminated in GC coverage plot

set up working space

rm(list=ls())
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Prepare data

df.all<-read.csv('Data/02_Metagenomics/mag_visualization/August2019_annotated_covstats_all_bins.csv')
head(df.all)
tail(df.all)
length(unique(df.all$name))
## [1] 126
df.all<-df.all[!df.all$name=='metabat.75_sub.contigs',]
#all bins
bins.all<-read.table('Data/02_Metagenomics/mag_visualization/all_bins.tsv')
head(bins.all)
colnames(bins.all)<-c('Name','Contig')
bin_Freq.all<-data.frame(table(bins.all$Name))
length(unique(bins.all$Name))
## [1] 126
bins.all<-bins.all[!bins.all$Name=='metabat.75_sub.contigs',]

#sanity check
length(unique(bins.all$Contig))
## [1] 84223
dim(bins.all)
## [1] 84223     2

Manipulate data

#Normalize Avg_fold values to num contigs per bin
df.all$Total<-bin_Freq.all[match(df.all$name, bin_Freq.all$Var1),'Freq']
df.all$Norm_Avg_Fold<-df.all$Avg_fold/df.all$Total
#Add in habitat 
lookup<-data.frame(libs=c('3847_A', '3847_B', '3847_C', '3847_D', '3847_E', '3847_F', '3847_G','3847_H', '3847_I'), habitat=rep(c('Out','Edge','In'),c(3,3,3)))
lookup
df.all$habitat<-lookup[match(df.all$library, lookup$libs),'habitat']
df.all$habitat<-factor(x = df.all$habitat, ordered = T, levels=c('In','Edge','Out'))
head(df.all)
summary(df.all$Length) #gives summary about contig lengths
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1001    2955    4364    6740    7534  369755

Calculate average coverage for each bin across all contigs per habitat and per sample

# reduce dataset to only look at contigs with 5000 bp
df.reduced.all<-df.all[df.all$Length > 5000,]
s.libs.all<-df.reduced.all %>% group_by(library,name) %>% summarise(mean=mean(Avg_fold),se=sd(Avg_fold)/sqrt(length(Avg_fold)), mean_norm_cov=mean(Norm_Avg_Fold))
head(s.libs.all)

Rename libraries

lib_lookup<-data.frame(library=unique(df.reduced.all$library), letter=c('A','B','C','D','E','F','G','H','I'))

df.reduced.all$lib2<-lib_lookup[match(df.reduced.all$library, lib_lookup$library),'letter']
df.reduced.all$lib2<-factor(df.reduced.all$lib2, ordered=T, levels=rev(c('A','B','C','D','E','F','G','H','I')))
s.libs.all$lib2<-lib_lookup[match(s.libs.all$library, lib_lookup$library),'letter']

Visualize coverage plots

in_edge_out_colors<-c('forestgreen','goldenrod1','lightblue2')
theme_new<- theme_bw() + theme(panel.grid.major = element_line(color='gray90'), panel.grid.minor = element_line(color='gray90'))

split into 25 plot slices

bins<-sort(unique(df.reduced.all$name))
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% bins[1:25],], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat))  +
  geom_boxplot(data=s.libs.all[s.libs.all$name %in% bins[1:25],], aes(x=lib2, y=log(mean+1))) + 
  facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())

ggsave('Data/02_Metagenomics/mag_visualization/bin_visualization_avg_fold_slice1.pdf', width=20, height=20)
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% bins[26:50],], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat))  +
  geom_boxplot(data=s.libs.all[s.libs.all$name %in% bins[26:50],], aes(x=lib2, y=log(mean+1))) + 
  facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())

ggsave('Data/02_Metagenomics/mag_visualization/bin_visualization_avg_fold_slice2.pdf', width=20, height=20)
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% bins[51:75],], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat))  +
  geom_boxplot(data=s.libs.all[s.libs.all$name %in% bins[51:75],], aes(x=lib2, y=log(mean+1))) + 
  facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())

ggsave('Data/02_Metagenomics/mag_visualization/bin_visualization_avg_fold_slice3.pdf', width=20, height=20)
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% bins[76:100],], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat))  +
  geom_boxplot(data=s.libs.all[s.libs.all$name %in% bins[76:100],], aes(x=lib2, y=log(mean+1))) + 
  facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())

ggsave('Data/02_Metagenomics/mag_visualization/bin_visualization_avg_fold_slice4.pdf', width=20, height=20)
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% bins[101:126],], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat))  +
  geom_boxplot(data=s.libs.all[s.libs.all$name %in% bins[101:126],], aes(x=lib2, y=log(mean+1))) + 
  facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())

ggsave('Data/02_Metagenomics/mag_visualization/bin_visualization_avg_fold_slice5.pdf', width=20, height=20)

Identify bins that only occur in single samples

head(s.libs.all)
summary(s.libs.all$mean)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1604  0.8401  1.8076  2.2550 64.3580
s.libs.all$presence<-ifelse(s.libs.all$mean>0.5, yes=T, no=F)
s.pres<-s.libs.all[s.libs.all$presence==T,]
num.libs.perbin<-s.pres %>% group_by(name) %>% count()
singletons<-num.libs.perbin[num.libs.perbin$n==1,]
singletons 

Confirm with visualization

ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% singletons$name,], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat))  +
  geom_boxplot(data=s.libs.all[s.libs.all$name %in% singletons$name,], aes(x=lib2, y=log(mean+1))) + 
  facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())

remove<-c('72.contigs','metabat.229.contigs','metabat.295.contigs','metabat.372.contigs', 'metabat.52.contigs')
singletons<-singletons[!singletons$name %in% remove,]
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% singletons$name,], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat))  +
  geom_boxplot(data=s.libs.all[s.libs.all$name %in% singletons$name,], aes(x=lib2, y=log(mean+1))) + 
  facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())

df.analysis<-df.reduced.all[!df.reduced.all$name %in% singletons$name,] #removes bins that only occur in a single library
df.mcov.analysis<-s.libs.all[!s.libs.all$name %in% singletons$name,]

Analysis

For each bin, compare average coverage value of the contigs across replicates

set up data

habitat_lookup<-data.frame(letter=c('A','B','C','D','E','F','G','H','I'),habitat=rep(c('Out','Edge','In'),c(3,3,3)))
df.mcov.analysis$location<-habitat_lookup[match(df.mcov.analysis$lib2, habitat_lookup$letter),'habitat']
head(df.mcov.analysis) 
vars<-unique(df.mcov.analysis$name); length(vars)
## [1] 119

preform initial model test and check for assumptions of normality

mod<-aov(log(mean) ~ location, data=df.mcov.analysis[df.mcov.analysis$name==vars[1],])
summary(mod)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## location     2 27.627  13.814   56.52 0.000128 ***
## Residuals    6  1.466   0.244                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(2,2))
plot(mod)

models<-df.mcov.analysis %>% group_by(name) %>% do(mod = summary(aov(log(mean+1) ~ location, data = .)))
results<-data.frame(bin=models$name,
  p.value=as.vector(unlist(models$mod)[names(unlist(models$mod))=='Pr(>F)1']),
  SumSq=as.vector(unlist(models$mod)[names(unlist(models$mod))=='Sum Sq1']),
  Df1=as.vector(unlist(models$mod)[names(unlist(models$mod))=='Df1']), 
  F_value=as.vector(unlist(models$mod)[names(unlist(models$mod))=='F value1']),
  mean_sq=as.vector(unlist(models$mod)[names(unlist(models$mod))=="Mean Sq1"]))
results$p.adjust<-p.adjust(results$p.value, method = 'BH')
head(results)
sig.taxa<-results[which(results$p.adjust < 0.1),];length(sig.taxa$bin)
## [1] 91
ns.taxa<-results[which(results$p.adjust > 0.1),]
write.csv(results,'Data/02_Metagenomics/mag_visualization/Aug19_bin_abundance_model_results.csv')

Check for pairwise differences

sig.mods<-df.mcov.analysis[df.mcov.analysis$name %in% sig.taxa$bin, ]; dim(sig.mods)
## [1] 819   8
models<-sig.mods %>% group_by(name) %>% do(mod = aov(log(mean+1) ~ location, data = .))
head(models)

For all models with adjusted pvalue < 0.1, went through and calculated posthoc tests to determine which groups were different. All other bins, visualually assessed which habitat they occured in. Notes of results are in file called: Results.xlsx

library(agricolae)
tuk.mod<-data.frame()
for (i in 1:length(models$name)){
tmp<-models$mod[[i]]
hsd<-HSD.test(y = tmp, trt = 'location')
tmp2<-data.frame( bin=models$name[i],hsd$groups)
tmp2$location<-rownames(tmp2)
tuk.mod<-rbind(tuk.mod,tmp2)
}
head(tuk.mod); tail(tuk.mod)
write.csv(tuk.mod,'Data/02_Metagenomics/mag_visualization/Aug19_tukey_bin_results.csv')